In [177]:
#imports
import nltk
from nltk.text import Text, TextCollection
import pandas as pd
from utils.data.readCorpus import NltkCorpusFromDir
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
In [178]:
#prepare the corpus
latinise = NltkCorpusFromDir(root="/home/krzys/Kod/streamlit/voces/data/corpora/latinise_IT_lemmas", fileids=r".*\.txt")
latinise_docs = []
for fileid in latinise.fileids():
    latinise_docs.append(Text(latinise.words(fileid)))
latinise_collection = TextCollection(latinise_docs)
In [179]:
# Terms
terms = ['gens', 'natio', 'civitas', 'populus', 'urbs']
# Fix colors for plotting purposes
color_discrete_map_terms = { term : px.colors.qualitative.Plotly[i] for i, term in enumerate(terms)}
In [180]:
#corpus metadata
filenames = latinise.fileids()
filenames = pd.DataFrame([(fname, fname.split('_')[2]) for fname in filenames], columns=["filename","id"])
metadata = pd.read_csv("/media/HOME_FOLDERS/krzys/Kod/lvlt22/BMG/latinise_metadata.csv", index_col="id")
metadata = metadata.merge(filenames,on="id")
metadata = metadata.drop_duplicates('id')
metadata = metadata.set_index('filename')
bins = range(-450,951,200)
labels = [ '-'.join([str(bin1), str(bin2)]) for bin1, bin2 in zip(bins[0:len(bins)-1],bins[1:len(bins)])]
metadata["period"] = pd.cut(metadata["date"], bins, labels=labels, include_lowest=True)
metadata["no_tokens"] = [  len(latinise.words(filename)) if filename in latinise.fileids() else 0 in filename  for filename in metadata.index.tolist() ]
In [181]:
# year by year frequency
dates = metadata["date"].unique()
cfd_year = nltk.ConditionalFreqDist()
for date in dates:
    condition = date
    for word in latinise.words(metadata[metadata["date"] == date].index):
        cfd_year[condition][word] +=1
In [182]:
#count tokens by year
freq_by_year = pd.DataFrame()
freq_by_year["year"] = pd.to_numeric(cfd_year.conditions())
freq_by_year["count"] = [ sum(cfd_year[year].values()) for year in freq_by_year["year"] ]
freq_by_year["period"] = pd.cut(freq_by_year["year"], bins=bins,labels=labels,include_lowest=True)
In [183]:
# count term frequency by year
terms_by_year = pd.DataFrame([ (year, term, counts[term]) for year, counts in cfd_year.items() for term in terms ],
                             columns = ["year", "term", "count"])
terms_by_year["year"] = pd.to_numeric(terms_by_year["year"])
#terms_by_year["term"].astype("category")
terms_by_year["count"] = pd.to_numeric(terms_by_year["count"])
terms_by_year["period"] = pd.cut(terms_by_year["year"],bins=bins,labels=labels,include_lowest=True)
terms_by_year.head()
Out[183]:
year term count period
0 -9 gens 0 -50-150
1 -9 natio 0 -50-150
2 -9 civitas 0 -50-150
3 -9 populus 10 -50-150
4 -9 urbs 7 -50-150
In [184]:
fig = px.scatter(terms_by_year,x="year", y="count", color="term")
fig.update_layout(title="Frequency of the terms by year")
fig.show()
In [185]:
# by period (ppm)
import numpy as np
freq_by_period = freq_by_year.groupby(["period"], as_index=False).agg({'count' : np.sum}) #all tokens by period
terms_by_period = terms_by_year.groupby(["term","period"], as_index=False).agg({'count' : np.sum})
counts_by_period = pd.merge(terms_by_period, freq_by_period, on=["period"], how="right")["count_y"]
terms_by_period["ppm"] = ( terms_by_period["count"] / counts_by_period ) * 1000000
terms_by_period.head()
Out[185]:
term period count ppm
0 civitas -450--250 0 0.000000e+00
1 civitas -250--50 395 5.316285e+05
2 civitas -50-150 1598 2.150740e+06
3 civitas 150-350 509 6.850606e+05
4 civitas 350-550 977 1.314939e+06
In [186]:
fig = px.line(terms_by_period,x="period", y="ppm", color="term", color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by period (ppm)")
fig.show()
In [187]:
# by period (raw counts)
fig = px.bar(terms_by_period,x="period", y="count", color="term", color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by period",barmode='stack')
fig.show()
In [188]:
# frequency by century
century = [-500,-400,-300,-200,-100,0,100,200,300,400,500,600,700,800,900,1000]
terms_by_year["century"] = pd.cut(terms_by_year["year"], bins=century, labels=range(-5,10,1))
terms_by_year.head()
Out[188]:
year term count period century
0 -9 gens 0 -50-150 -1
1 -9 natio 0 -50-150 -1
2 -9 civitas 0 -50-150 -1
3 -9 populus 10 -50-150 -1
4 -9 urbs 7 -50-150 -1
In [189]:
terms_by_century = terms_by_year.groupby(["century", "term"]).agg({'count':np.sum}).reset_index()
In [190]:
fig = px.line(terms_by_century,x="century", y="count", color="term", color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Frequency of the terms by century")
fig.show()
In [191]:
#by fileid - prepare freq distribution
import itertools
cfd_bytext = nltk.ConditionalFreqDist()
for file in [ x for x in latinise.fileids() ]:
    for word in latinise.words(file):
        cfd_bytext[file][word] +=1

terms_by_text = pd.DataFrame([ (filename, term, counts[term]) for filename, counts in cfd_bytext.items() for term in terms ],
                             columns = ["filename", "term", "count"])
terms_by_text = pd.merge(terms_by_text, metadata, on="filename")
terms_by_text['ppm'] = (terms_by_text['count'] / terms_by_text['no_tokens']) * 1000000
terms_by_text.head()

# top works by period - raw
top_terms_by_text = terms_by_text.sort_values('count', ascending=False).groupby(["period","term"]).head(5).sort_values("count", ascending=False)
In [192]:
fig = px.bar(top_terms_by_text,x="id", y="count", color="term", facet_col="period", facet_row="term", text="title", facet_col_wrap=2, category_orders={"period":labels}, 
             height=800, hover_data=["id", "creator", "title", "period"], color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Texts with max number of terms by period", uniformtext_minsize=10, uniformtext_mode='hide')
fig.update_xaxes(matches=None, visible=False)
fig.update_yaxes(matches=None)
fig.show()
In [193]:
# top works by period - ppm
top_terms_by_text = terms_by_text.sort_values('ppm', ascending=False).groupby(["period","term"]).head(5).sort_values("ppm", ascending=False)
fig = px.bar(top_terms_by_text,x="id", y="ppm", color="term", facet_col="period", facet_row="term", text="title", facet_col_wrap=2, category_orders={"period":labels}, 
             height=1200, hover_data=["id", "creator", "title", "period"], color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Texts with max number of terms by period (ppm)",
                  uniformtext_minsize=14)
fig.update_xaxes(matches=None, visible=False)
fig.update_traces(textposition='inside', textfont_size=16)
fig.show()
In [194]:
#top_terms_by_author
top_terms_by_author = terms_by_text.sort_values('count', ascending=False).groupby(["creator","term","period"], observed=True).agg({'count':np.sum}).reset_index().sort_values("count", ascending=False)
fig = px.bar(top_terms_by_author, x="creator", y="count", color="term", facet_col="period", facet_row="term", text="creator", facet_col_wrap=2, category_orders={"period":labels}, 
             height=800, hover_data=["creator", "period"], color_discrete_map=color_discrete_map_terms)

fig.update_layout(title="Authors with max number of terms by period", uniformtext_minsize=8)
fig.update_xaxes(matches=None, visible=False)
fig.update_traces(textposition='outside', textfont_size=14)
fig.update_yaxes(matches=None)
fig.show()
In [195]:
# terms by genre
top_terms_by_text.head()
top_terms_by_genre = terms_by_text.sort_values('count', ascending=False).groupby(["term", "type","period"]).head(5).sort_values("count", ascending=False)
top_terms_by_genre.head()
fig = px.bar(top_terms_by_genre,x="period", y="count", color="term", facet_col="period", facet_row="type", facet_col_wrap=2, category_orders={"period":labels}, 
             height=800, hover_data=["id", "creator", "title", "period"], color_discrete_map=color_discrete_map_terms)
fig.update_layout(title="Texts with max number of terms by period (raw)", uniformtext_minsize=8)
fig.update_xaxes(matches=None, visible=False)
fig.update_yaxes(matches=None)
fig.show()